Data Aquisition, Loading, and Composition

Load in the data that was cleaned and combined from different data sets. This work can be seen in the Appendix.

data <- read.csv("data/combined_zillow_project_data.csv")

EDA

library(tigris)
library(sf)
library(dplyr)
library(ggplot2)

Base map construction for contiguous US

# Load counties but exclude Alaska, Hawaii, and U.S. territories
us_counties_sf <- counties(cb = TRUE, class = "sf") %>%
  filter(!substr(GEOID, 1, 2) %in% c("02", "15", "60", "66", "69", "72", "78"))
## Retrieving data for the year 2022
##   |                                                                              |                                                                      |   0%  |                                                                              |                                                                      |   1%  |                                                                              |=                                                                     |   1%  |                                                                              |=                                                                     |   2%  |                                                                              |==                                                                    |   2%  |                                                                              |==                                                                    |   3%  |                                                                              |==                                                                    |   4%  |                                                                              |===                                                                   |   4%  |                                                                              |===                                                                   |   5%  |                                                                              |====                                                                  |   5%  |                                                                              |====                                                                  |   6%  |                                                                              |=====                                                                 |   6%  |                                                                              |=====                                                                 |   7%  |                                                                              |=====                                                                 |   8%  |                                                                              |======                                                                |   8%  |                                                                              |======                                                                |   9%  |                                                                              |=======                                                               |   9%  |                                                                              |=======                                                               |  10%  |                                                                              |=======                                                               |  11%  |                                                                              |========                                                              |  11%  |                                                                              |========                                                              |  12%  |                                                                              |=========                                                             |  12%  |                                                                              |=========                                                             |  13%  |                                                                              |==========                                                            |  14%  |                                                                              |==========                                                            |  15%  |                                                                              |===========                                                           |  15%  |                                                                              |===========                                                           |  16%  |                                                                              |============                                                          |  17%  |                                                                              |============                                                          |  18%  |                                                                              |=============                                                         |  18%  |                                                                              |=============                                                         |  19%  |                                                                              |==============                                                        |  19%  |                                                                              |==============                                                        |  20%  |                                                                              |==============                                                        |  21%  |                                                                              |===============                                                       |  21%  |                                                                              |===============                                                       |  22%  |                                                                              |================                                                      |  22%  |                                                                              |================                                                      |  23%  |                                                                              |================                                                      |  24%  |                                                                              |=================                                                     |  24%  |                                                                              |=================                                                     |  25%  |                                                                              |==================                                                    |  25%  |                                                                              |==================                                                    |  26%  |                                                                              |===================                                                   |  26%  |                                                                              |===================                                                   |  27%  |                                                                              |===================                                                   |  28%  |                                                                              |====================                                                  |  28%  |                                                                              |====================                                                  |  29%  |                                                                              |=====================                                                 |  29%  |                                                                              |=====================                                                 |  30%  |                                                                              |=====================                                                 |  31%  |                                                                              |======================                                                |  31%  |                                                                              |======================                                                |  32%  |                                                                              |=======================                                               |  32%  |                                                                              |=======================                                               |  33%  |                                                                              |=======================                                               |  34%  |                                                                              |========================                                              |  34%  |                                                                              |========================                                              |  35%  |                                                                              |=========================                                             |  35%  |                                                                              |=========================                                             |  36%  |                                                                              |==========================                                            |  36%  |                                                                              |==========================                                            |  37%  |                                                                              |==========================                                            |  38%  |                                                                              |===========================                                           |  38%  |                                                                              |===========================                                           |  39%  |                                                                              |============================                                          |  39%  |                                                                              |============================                                          |  40%  |                                                                              |============================                                          |  41%  |                                                                              |=============================                                         |  41%  |                                                                              |=============================                                         |  42%  |                                                                              |==============================                                        |  42%  |                                                                              |==============================                                        |  43%  |                                                                              |===============================                                       |  44%  |                                                                              |===============================                                       |  45%  |                                                                              |================================                                      |  45%  |                                                                              |================================                                      |  46%  |                                                                              |=================================                                     |  47%  |                                                                              |=================================                                     |  48%  |                                                                              |==================================                                    |  48%  |                                                                              |==================================                                    |  49%  |                                                                              |===================================                                   |  49%  |                                                                              |===================================                                   |  50%  |                                                                              |===================================                                   |  51%  |                                                                              |====================================                                  |  51%  |                                                                              |====================================                                  |  52%  |                                                                              |=====================================                                 |  52%  |                                                                              |=====================================                                 |  53%  |                                                                              |=====================================                                 |  54%  |                                                                              |======================================                                |  54%  |                                                                              |======================================                                |  55%  |                                                                              |=======================================                               |  55%  |                                                                              |=======================================                               |  56%  |                                                                              |========================================                              |  56%  |                                                                              |========================================                              |  57%  |                                                                              |========================================                              |  58%  |                                                                              |=========================================                             |  58%  |                                                                              |=========================================                             |  59%  |                                                                              |==========================================                            |  59%  |                                                                              |==========================================                            |  60%  |                                                                              |==========================================                            |  61%  |                                                                              |===========================================                           |  61%  |                                                                              |===========================================                           |  62%  |                                                                              |============================================                          |  62%  |                                                                              |============================================                          |  63%  |                                                                              |============================================                          |  64%  |                                                                              |=============================================                         |  64%  |                                                                              |=============================================                         |  65%  |                                                                              |==============================================                        |  65%  |                                                                              |==============================================                        |  66%  |                                                                              |===============================================                       |  66%  |                                                                              |===============================================                       |  67%  |                                                                              |===============================================                       |  68%  |                                                                              |================================================                      |  68%  |                                                                              |================================================                      |  69%  |                                                                              |=================================================                     |  69%  |                                                                              |=================================================                     |  70%  |                                                                              |=================================================                     |  71%  |                                                                              |==================================================                    |  71%  |                                                                              |==================================================                    |  72%  |                                                                              |===================================================                   |  72%  |                                                                              |===================================================                   |  73%  |                                                                              |====================================================                  |  74%  |                                                                              |====================================================                  |  75%  |                                                                              |=====================================================                 |  75%  |                                                                              |=====================================================                 |  76%  |                                                                              |======================================================                |  77%  |                                                                              |======================================================                |  78%  |                                                                              |=======================================================               |  78%  |                                                                              |=======================================================               |  79%  |                                                                              |========================================================              |  79%  |                                                                              |========================================================              |  80%  |                                                                              |========================================================              |  81%  |                                                                              |=========================================================             |  81%  |                                                                              |=========================================================             |  82%  |                                                                              |==========================================================            |  82%  |                                                                              |==========================================================            |  83%  |                                                                              |==========================================================            |  84%  |                                                                              |===========================================================           |  84%  |                                                                              |===========================================================           |  85%  |                                                                              |============================================================          |  85%  |                                                                              |============================================================          |  86%  |                                                                              |=============================================================         |  87%  |                                                                              |=============================================================         |  88%  |                                                                              |==============================================================        |  88%  |                                                                              |==============================================================        |  89%  |                                                                              |===============================================================       |  89%  |                                                                              |===============================================================       |  90%  |                                                                              |===============================================================       |  91%  |                                                                              |================================================================      |  91%  |                                                                              |================================================================      |  92%  |                                                                              |=================================================================     |  92%  |                                                                              |=================================================================     |  93%  |                                                                              |=================================================================     |  94%  |                                                                              |==================================================================    |  94%  |                                                                              |==================================================================    |  95%  |                                                                              |===================================================================   |  95%  |                                                                              |===================================================================   |  96%  |                                                                              |====================================================================  |  97%  |                                                                              |====================================================================  |  98%  |                                                                              |===================================================================== |  98%  |                                                                              |===================================================================== |  99%  |                                                                              |======================================================================|  99%  |                                                                              |======================================================================| 100%
# Plot only the mainland U.S.
ggplot(us_counties_sf) +
  geom_sf(fill = "lightgray", color = "black") +
  coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) +  # Crop to mainland
  ggtitle("Mainland U.S. Counties (Excluding AK, HI, & Territories)") +
  theme_minimal()

Aggregate by county

# Aggregate by county and calculate the average of several columns together
county_aggregated_df <- data %>%
  group_by(CountyName, GEOID, StateName, lat, lng) %>%
  summarise(
    
    avg_historical_range_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31, X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31, X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31, X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31, X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31, X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31, X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31, X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31, X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31, X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31, X2025.01.31), na.rm = TRUE),
    avg_2024_rent = mean(c(X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31), na.rm = TRUE),
    avg_2023_rent = mean(c(X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31), na.rm = TRUE),
    avg_2022_rent = mean(c(X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31), na.rm = TRUE),
    avg_2021_rent = mean(c(X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31), na.rm = TRUE),
    avg_2020_rent = mean(c(X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31), na.rm = TRUE),
    avg_2019_rent = mean(c(X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31), na.rm = TRUE),
    avg_2018_rent = mean(c(X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31), na.rm = TRUE),
    avg_2017_rent = mean(c(X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31), na.rm = TRUE),
    avg_2016_rent = mean(c(X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31), na.rm = TRUE),
    avg_2015_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31), na.rm = TRUE),
    
    avg_population = mean(population, na.rm = TRUE),
    
    avg_historical_range_median_income = mean(c(median_income_2015, median_income_2016, median_income_2017, median_income_2018, median_income_2019, median_income_2020, median_income_2021, median_income_2022), na.rm = TRUE),
    avg_2015_median_income = mean(median_income_2015, na.rm = TRUE),
    avg_2016_median_income = mean(median_income_2016, na.rm = TRUE),
    avg_2017_median_income = mean(median_income_2017, na.rm = TRUE),
    avg_2018_median_income = mean(median_income_2018, na.rm = TRUE),
    avg_2019_median_income = mean(median_income_2019, na.rm = TRUE),
    avg_2020_median_income = mean(median_income_2020, na.rm = TRUE),
    avg_2021_median_income = mean(median_income_2021, na.rm = TRUE),
    avg_2022_median_income = mean(median_income_2022, na.rm = TRUE),
    
    avg_historical_range_median_age = mean(c(median_age_2015, median_age_2016, median_age_2017, median_age_2018, median_age_2019, median_age_2020, median_age_2021, median_age_2022), na.rm = TRUE),
    avg_2015_median_age = mean(median_age_2015, na.rm = TRUE),
    avg_2016_median_age = mean(median_age_2016, na.rm = TRUE),
    avg_2017_median_age = mean(median_age_2017, na.rm = TRUE),
    avg_2018_median_age = mean(median_age_2018, na.rm = TRUE),
    avg_2019_median_age = mean(median_age_2019, na.rm = TRUE),
    avg_2020_median_age = mean(median_age_2020, na.rm = TRUE),
    avg_2021_median_age = mean(median_age_2021, na.rm = TRUE),
    avg_2022_median_age = mean(median_age_2022, na.rm = TRUE),
    
    avg_historical_range_property_taxes = mean(c(property_taxes.x_2015, property_taxes.x_2016, property_taxes.x_2017, property_taxes.x_2018, property_taxes.x_2019, property_taxes.x_2020, property_taxes.x_2021, property_taxes.x_2022), na.rm = TRUE),
    avg_2015_property_taxes = mean(property_taxes.x_2015, na.rm = TRUE),
    avg_2016_property_taxes = mean(property_taxes.x_2016, na.rm = TRUE),
    avg_2017_property_taxes = mean(property_taxes.x_2017, na.rm = TRUE),
    avg_2018_property_taxes = mean(property_taxes.x_2018, na.rm = TRUE),
    avg_2019_property_taxes = mean(property_taxes.x_2019, na.rm = TRUE),
    avg_2020_property_taxes = mean(property_taxes.x_2020, na.rm = TRUE),
    avg_2021_property_taxes = mean(property_taxes.x_2021, na.rm = TRUE),
    avg_2022_property_taxes = mean(property_taxes.x_2022, na.rm = TRUE),
  )
## `summarise()` has grouped output by 'CountyName', 'GEOID', 'StateName', 'lat'.
## You can override using the `.groups` argument.
county_aggregated_df$GEOID <- as.character(county_aggregated_df$GEOID)
# List of potential values
valid_categories <- c("AL", "AK", "AZ", "AR", "CA", "CO", "CT")
county_aggregated_df <- county_aggregated_df %>%
  mutate(
    GEOID = case_when(
      StateName %in% valid_categories ~ str_pad(GEOID, width = max(nchar(GEOID)) + 1, side = "left", pad = "0"),
      TRUE ~ GEOID  # Default case (for rows not matching the condition)
    )
  )

map_data <- us_counties_sf %>%
  left_join(county_aggregated_df, by = "GEOID")

ggplot(map_data) +
  geom_sf(aes(fill = avg_historical_range_rent), color = "black") +
  scale_fill_viridis_c(name = "Average Rent ($)") +
  coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) +
  ggtitle("Average Historical Rent Price (2015-2024) by County (Mainland U.S.)") +
  theme_minimal()

ggplot(map_data) +
  geom_sf(aes(fill = avg_historical_range_median_income), color = "black") +
  scale_fill_viridis_c(name = "Average Median Income ($)") +
  coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) +
  ggtitle("Average Historical Median Income (2015-2022) by County (Mainland U.S.)") +
  theme_minimal()

ggplot(map_data) +
  geom_sf(aes(fill = avg_historical_range_property_taxes), color = "black") +
  scale_fill_viridis_c(name = "Average Property Taxes ($)") +
  coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) +
  ggtitle("Average Historical Property Taxes (2015-2022) by County (Mainland U.S.)") +
  theme_minimal()

ggplot(map_data) +
  geom_sf(aes(fill = avg_historical_range_median_age), color = "black") +
  scale_fill_viridis_c(name = "Average Median Age (years)") +
  coord_sf(xlim = c(-125, -66), ylim = c(24, 50), expand = FALSE) +
  ggtitle("Average Historical Median Age (2015-2022) by County (Mainland U.S.)") +
  theme_minimal()

Correlation on a Historic Basis for the United States as a Whole

Examine correlations between historical rent prices and population sizes, median income, median age, and property taxes:

cor(county_aggregated_df$avg_historical_range_rent, county_aggregated_df$avg_population)
cor(county_aggregated_df$avg_historical_range_rent, county_aggregated_df$avg_historical_range_median_income)
cor(county_aggregated_df$avg_historical_range_rent, county_aggregated_df$avg_historical_range_median_age)
cor(county_aggregated_df$avg_historical_range_rent, county_aggregated_df$avg_historical_range_property_taxes)

The strongest correlations are between rent prices and the median income (0.43) and the property taxes (0.409)

State Level Analysis

Pick States to Zoom in On

table(data$StateName)
## 
##  AK  AL  AR  AZ  CA  CO  DC  DE  FL  GA  HI  IA  ID  IL  IN  KS  KY  LA  MA  MD 
##   2  54  25  49 396  64   1   9 279 112  18  29  22 118  50  25  28  35 104  67 
##  ME  MI  MN  MO  MS  MT  NC  ND  NE  NH  NJ  NM  NV  NY  OH  OK  OR  PA  RI  SC 
##  14  88  62  37  26  10 108  10  12  16 166  13  12 127 116  38  57 148  21  58 
##  SD  TN  TX  UT  VA  VT  WA  WI  WV  WY 
##   7  59 251  49  67   2 102  47   9   8

States with the most data points to zoom in on: California, Florida, and Texas

CA_data <- data %>%
  filter(StateName == 'CA')

FL_data <- data %>%
  filter(StateName == 'FL')

TX_data <- data %>%
  filter(StateName == 'TX')

California

CA_county_aggregated_df <- CA_data %>%
  group_by(CountyName, GEOID, lat, lng) %>%
  summarise(
    
    avg_historical_range_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31, X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31, X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31, X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31, X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31, X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31, X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31, X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31, X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31, X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31, X2025.01.31), na.rm = TRUE),
    avg_2024_rent = mean(c(X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31), na.rm = TRUE),
    avg_2023_rent = mean(c(X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31), na.rm = TRUE),
    avg_2022_rent = mean(c(X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31), na.rm = TRUE),
    avg_2021_rent = mean(c(X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31), na.rm = TRUE),
    avg_2020_rent = mean(c(X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31), na.rm = TRUE),
    avg_2019_rent = mean(c(X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31), na.rm = TRUE),
    avg_2018_rent = mean(c(X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31), na.rm = TRUE),
    avg_2017_rent = mean(c(X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31), na.rm = TRUE),
    avg_2016_rent = mean(c(X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31), na.rm = TRUE),
    avg_2015_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31), na.rm = TRUE),
    
    avg_population = mean(population, na.rm = TRUE),
    
    avg_historical_range_median_income = mean(c(median_income_2015, median_income_2016, median_income_2017, median_income_2018, median_income_2019, median_income_2020, median_income_2021, median_income_2022), na.rm = TRUE),
    avg_2015_median_income = mean(median_income_2015, na.rm = TRUE),
    avg_2016_median_income = mean(median_income_2016, na.rm = TRUE),
    avg_2017_median_income = mean(median_income_2017, na.rm = TRUE),
    avg_2018_median_income = mean(median_income_2018, na.rm = TRUE),
    avg_2019_median_income = mean(median_income_2019, na.rm = TRUE),
    avg_2020_median_income = mean(median_income_2020, na.rm = TRUE),
    avg_2021_median_income = mean(median_income_2021, na.rm = TRUE),
    avg_2022_median_income = mean(median_income_2022, na.rm = TRUE),
    
    avg_historical_range_median_age = mean(c(median_age_2015, median_age_2016, median_age_2017, median_age_2018, median_age_2019, median_age_2020, median_age_2021, median_age_2022), na.rm = TRUE),
    avg_2015_median_age = mean(median_age_2015, na.rm = TRUE),
    avg_2016_median_age = mean(median_age_2016, na.rm = TRUE),
    avg_2017_median_age = mean(median_age_2017, na.rm = TRUE),
    avg_2018_median_age = mean(median_age_2018, na.rm = TRUE),
    avg_2019_median_age = mean(median_age_2019, na.rm = TRUE),
    avg_2020_median_age = mean(median_age_2020, na.rm = TRUE),
    avg_2021_median_age = mean(median_age_2021, na.rm = TRUE),
    avg_2022_median_age = mean(median_age_2022, na.rm = TRUE),
    
    avg_historical_range_property_taxes = mean(c(property_taxes.x_2015, property_taxes.x_2016, property_taxes.x_2017, property_taxes.x_2018, property_taxes.x_2019, property_taxes.x_2020, property_taxes.x_2021, property_taxes.x_2022), na.rm = TRUE),
    avg_2015_property_taxes = mean(property_taxes.x_2015, na.rm = TRUE),
    avg_2016_property_taxes = mean(property_taxes.x_2016, na.rm = TRUE),
    avg_2017_property_taxes = mean(property_taxes.x_2017, na.rm = TRUE),
    avg_2018_property_taxes = mean(property_taxes.x_2018, na.rm = TRUE),
    avg_2019_property_taxes = mean(property_taxes.x_2019, na.rm = TRUE),
    avg_2020_property_taxes = mean(property_taxes.x_2020, na.rm = TRUE),
    avg_2021_property_taxes = mean(property_taxes.x_2021, na.rm = TRUE),
    avg_2022_property_taxes = mean(property_taxes.x_2022, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'CountyName', 'GEOID', 'lat'. You can
## override using the `.groups` argument.
state_fips <- "06"

counties_sf <- counties(state = state_fips, cb = TRUE, class = "sf") 
## Retrieving data for the year 2022
CA_county_aggregated_df$GEOID <- as.character(CA_county_aggregated_df$GEOID)
CA_county_aggregated_df <- CA_county_aggregated_df %>%
  mutate(GEOID = str_pad(GEOID, width = max(nchar(GEOID)) + 1, side = "left", pad = "0"))

map_data <- counties_sf %>%
  left_join(CA_county_aggregated_df, by = "GEOID")

ggplot(map_data) +
  geom_sf(aes(fill = avg_historical_range_rent), color = "black") +
  scale_fill_viridis_c(name = "Average Rent ($)") +
  ggtitle("Average Historical Rent Price (2015-2025) by County (Mainland U.S.)") +
  theme_minimal()

ggplot(map_data) +
  geom_sf(aes(fill = avg_2024_rent), color = "black") +
  scale_fill_viridis_c(name = "Average Rent ($)") +
  ggtitle("Average Rent Price in 2024 by County (California)") +
  theme_minimal()

Florida

FL_county_aggregated_df <- FL_data %>%
  group_by(CountyName, GEOID, lat, lng) %>%
  summarise(
    
    avg_historical_range_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31, X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31, X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31, X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31, X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31, X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31, X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31, X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31, X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31, X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31, X2025.01.31), na.rm = TRUE),
    avg_2024_rent = mean(c(X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31), na.rm = TRUE),
    avg_2023_rent = mean(c(X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31), na.rm = TRUE),
    avg_2022_rent = mean(c(X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31), na.rm = TRUE),
    avg_2021_rent = mean(c(X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31), na.rm = TRUE),
    avg_2020_rent = mean(c(X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31), na.rm = TRUE),
    avg_2019_rent = mean(c(X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31), na.rm = TRUE),
    avg_2018_rent = mean(c(X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31), na.rm = TRUE),
    avg_2017_rent = mean(c(X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31), na.rm = TRUE),
    avg_2016_rent = mean(c(X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31), na.rm = TRUE),
    avg_2015_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31), na.rm = TRUE),
    
    avg_population = mean(population, na.rm = TRUE),
    
    avg_historical_range_median_income = mean(c(median_income_2015, median_income_2016, median_income_2017, median_income_2018, median_income_2019, median_income_2020, median_income_2021, median_income_2022), na.rm = TRUE),
    avg_2015_median_income = mean(median_income_2015, na.rm = TRUE),
    avg_2016_median_income = mean(median_income_2016, na.rm = TRUE),
    avg_2017_median_income = mean(median_income_2017, na.rm = TRUE),
    avg_2018_median_income = mean(median_income_2018, na.rm = TRUE),
    avg_2019_median_income = mean(median_income_2019, na.rm = TRUE),
    avg_2020_median_income = mean(median_income_2020, na.rm = TRUE),
    avg_2021_median_income = mean(median_income_2021, na.rm = TRUE),
    avg_2022_median_income = mean(median_income_2022, na.rm = TRUE),
    
    avg_historical_range_median_age = mean(c(median_age_2015, median_age_2016, median_age_2017, median_age_2018, median_age_2019, median_age_2020, median_age_2021, median_age_2022), na.rm = TRUE),
    avg_2015_median_age = mean(median_age_2015, na.rm = TRUE),
    avg_2016_median_age = mean(median_age_2016, na.rm = TRUE),
    avg_2017_median_age = mean(median_age_2017, na.rm = TRUE),
    avg_2018_median_age = mean(median_age_2018, na.rm = TRUE),
    avg_2019_median_age = mean(median_age_2019, na.rm = TRUE),
    avg_2020_median_age = mean(median_age_2020, na.rm = TRUE),
    avg_2021_median_age = mean(median_age_2021, na.rm = TRUE),
    avg_2022_median_age = mean(median_age_2022, na.rm = TRUE),
    
    avg_historical_range_property_taxes = mean(c(property_taxes.x_2015, property_taxes.x_2016, property_taxes.x_2017, property_taxes.x_2018, property_taxes.x_2019, property_taxes.x_2020, property_taxes.x_2021, property_taxes.x_2022), na.rm = TRUE),
    avg_2015_property_taxes = mean(property_taxes.x_2015, na.rm = TRUE),
    avg_2016_property_taxes = mean(property_taxes.x_2016, na.rm = TRUE),
    avg_2017_property_taxes = mean(property_taxes.x_2017, na.rm = TRUE),
    avg_2018_property_taxes = mean(property_taxes.x_2018, na.rm = TRUE),
    avg_2019_property_taxes = mean(property_taxes.x_2019, na.rm = TRUE),
    avg_2020_property_taxes = mean(property_taxes.x_2020, na.rm = TRUE),
    avg_2021_property_taxes = mean(property_taxes.x_2021, na.rm = TRUE),
    avg_2022_property_taxes = mean(property_taxes.x_2022, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'CountyName', 'GEOID', 'lat'. You can
## override using the `.groups` argument.
state_fips <- "12"

counties_sf <- counties(state = state_fips, cb = TRUE, class = "sf") 
## Retrieving data for the year 2022
FL_county_aggregated_df$GEOID <- as.character(FL_county_aggregated_df$GEOID)

map_data <- counties_sf %>%
  left_join(FL_county_aggregated_df, by = "GEOID")

ggplot(map_data) +
  geom_sf(aes(fill = avg_historical_range_rent), color = "black") +
  scale_fill_viridis_c(name = "Average Rent ($)") +
  ggtitle("Average Historical Rent Price (2015-2024) by County (Florida)") +
  theme_minimal()

ggplot(map_data) +
  geom_sf(aes(fill = avg_2024_rent), color = "black") +
  scale_fill_viridis_c(name = "Average Rent ($)") +
  ggtitle("Average Rent Price in 2024 by County (Florida)") +
  theme_minimal()

Texas

TX_county_aggregated_df <- TX_data %>%
  group_by(CountyName, GEOID, lat, lng) %>%
  summarise(
    
    avg_historical_range_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31, X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31, X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31, X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31, X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31, X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31, X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31, X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31, X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31, X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31, X2025.01.31), na.rm = TRUE),
    avg_2024_rent = mean(c(X2024.01.31, X2024.02.29, X2024.03.31, X2024.04.30, X2024.05.31, X2024.06.30, X2024.07.31, X2024.08.31, X2024.09.30, X2024.10.31, X2024.11.30, X2024.12.31), na.rm = TRUE),
    avg_2023_rent = mean(c(X2023.01.31, X2023.02.28, X2023.03.31, X2023.04.30, X2023.05.31, X2023.06.30, X2023.07.31, X2023.08.31, X2023.09.30, X2023.10.31, X2023.11.30, X2023.12.31), na.rm = TRUE),
    avg_2022_rent = mean(c(X2022.01.31, X2022.02.28, X2022.03.31, X2022.04.30, X2022.05.31, X2022.06.30, X2022.07.31, X2022.08.31, X2022.09.30, X2022.10.31, X2022.11.30, X2022.12.31), na.rm = TRUE),
    avg_2021_rent = mean(c(X2021.01.31, X2021.02.28, X2021.03.31, X2021.04.30, X2021.05.31, X2021.06.30, X2021.07.31, X2021.08.31, X2021.09.30, X2021.10.31, X2021.11.30, X2021.12.31), na.rm = TRUE),
    avg_2020_rent = mean(c(X2020.01.31, X2020.02.29, X2020.03.31, X2020.04.30, X2020.05.31, X2020.06.30, X2020.07.31, X2020.08.31, X2020.09.30, X2020.10.31, X2020.11.30, X2020.12.31), na.rm = TRUE),
    avg_2019_rent = mean(c(X2019.01.31, X2019.02.28, X2019.03.31, X2019.04.30, X2019.05.31, X2019.06.30, X2019.07.31, X2019.08.31, X2019.09.30, X2019.10.31, X2019.11.30, X2019.12.31), na.rm = TRUE),
    avg_2018_rent = mean(c(X2018.01.31, X2018.02.28, X2018.03.31, X2018.04.30, X2018.05.31, X2018.06.30, X2018.07.31, X2018.08.31, X2018.09.30, X2018.10.31, X2018.11.30, X2018.12.31), na.rm = TRUE),
    avg_2017_rent = mean(c(X2017.01.31, X2017.02.28, X2017.03.31, X2017.04.30, X2017.05.31, X2017.06.30, X2017.07.31, X2017.08.31, X2017.09.30, X2017.10.31, X2017.11.30, X2017.12.31), na.rm = TRUE),
    avg_2016_rent = mean(c(X2016.01.31, X2016.02.29, X2016.03.31, X2016.04.30, X2016.05.31, X2016.06.30, X2016.07.31, X2016.08.31, X2016.09.30, X2016.10.31, X2016.11.30, X2016.12.31), na.rm = TRUE),
    avg_2015_rent = mean(c(X2015.01.31, X2015.02.28, X2015.03.31, X2015.04.30, X2015.05.31, X2015.06.30, X2015.07.31, X2015.08.31, X2015.09.30, X2015.10.31, X2015.11.30, X2015.12.31), na.rm = TRUE),
    
    avg_population = mean(population, na.rm = TRUE),
    
    avg_historical_range_median_income = mean(c(median_income_2015, median_income_2016, median_income_2017, median_income_2018, median_income_2019, median_income_2020, median_income_2021, median_income_2022), na.rm = TRUE),
    avg_2015_median_income = mean(median_income_2015, na.rm = TRUE),
    avg_2016_median_income = mean(median_income_2016, na.rm = TRUE),
    avg_2017_median_income = mean(median_income_2017, na.rm = TRUE),
    avg_2018_median_income = mean(median_income_2018, na.rm = TRUE),
    avg_2019_median_income = mean(median_income_2019, na.rm = TRUE),
    avg_2020_median_income = mean(median_income_2020, na.rm = TRUE),
    avg_2021_median_income = mean(median_income_2021, na.rm = TRUE),
    avg_2022_median_income = mean(median_income_2022, na.rm = TRUE),
    
    avg_historical_range_median_age = mean(c(median_age_2015, median_age_2016, median_age_2017, median_age_2018, median_age_2019, median_age_2020, median_age_2021, median_age_2022), na.rm = TRUE),
    avg_2015_median_age = mean(median_age_2015, na.rm = TRUE),
    avg_2016_median_age = mean(median_age_2016, na.rm = TRUE),
    avg_2017_median_age = mean(median_age_2017, na.rm = TRUE),
    avg_2018_median_age = mean(median_age_2018, na.rm = TRUE),
    avg_2019_median_age = mean(median_age_2019, na.rm = TRUE),
    avg_2020_median_age = mean(median_age_2020, na.rm = TRUE),
    avg_2021_median_age = mean(median_age_2021, na.rm = TRUE),
    avg_2022_median_age = mean(median_age_2022, na.rm = TRUE),
    
    avg_historical_range_property_taxes = mean(c(property_taxes.x_2015, property_taxes.x_2016, property_taxes.x_2017, property_taxes.x_2018, property_taxes.x_2019, property_taxes.x_2020, property_taxes.x_2021, property_taxes.x_2022), na.rm = TRUE),
    avg_2015_property_taxes = mean(property_taxes.x_2015, na.rm = TRUE),
    avg_2016_property_taxes = mean(property_taxes.x_2016, na.rm = TRUE),
    avg_2017_property_taxes = mean(property_taxes.x_2017, na.rm = TRUE),
    avg_2018_property_taxes = mean(property_taxes.x_2018, na.rm = TRUE),
    avg_2019_property_taxes = mean(property_taxes.x_2019, na.rm = TRUE),
    avg_2020_property_taxes = mean(property_taxes.x_2020, na.rm = TRUE),
    avg_2021_property_taxes = mean(property_taxes.x_2021, na.rm = TRUE),
    avg_2022_property_taxes = mean(property_taxes.x_2022, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'CountyName', 'GEOID', 'lat'. You can
## override using the `.groups` argument.
state_fips <- "48"

counties_sf <- counties(state = state_fips, cb = TRUE, class = "sf") 
## Retrieving data for the year 2022
TX_county_aggregated_df$GEOID <- as.character(TX_county_aggregated_df$GEOID)

map_data <- counties_sf %>%
  left_join(TX_county_aggregated_df, by = "GEOID")

ggplot(map_data) +
  geom_sf(aes(fill = avg_historical_range_rent), color = "black") +
  scale_fill_viridis_c(name = "Average Rent ($)") +
  ggtitle("Average Historical Rent Price (2015-2024) by County (Texas)") +
  theme_minimal()

ggplot(map_data) +
  geom_sf(aes(fill = avg_2024_rent), color = "black") +
  scale_fill_viridis_c(name = "Average Rent ($)") +
  ggtitle("Average Rent Price in 2024 by County (Texas)") +
  theme_minimal()

Modeling for Rent in 2024

Let’s zoom back out the the country as a whole.

First, make a linear regression model just using the past years rent to see how the model performs in predicting the 2024 rent price for each county.

data$avg_rent_2024 <- rowMeans(data[, c('X2024.01.31', 'X2024.02.29', 'X2024.03.31', 'X2024.04.30', 'X2024.05.31', 'X2024.06.30', 'X2024.07.31', 'X2024.08.31', 'X2024.09.30', 'X2024.10.31', 'X2024.11.30', 'X2024.12.31')], na.rm = TRUE)
data$avg_rent_2023 <- rowMeans(data[, c('X2023.01.31', 'X2023.02.28', 'X2023.03.31', 'X2023.04.30', 'X2023.05.31', 'X2023.06.30', 'X2023.07.31', 'X2023.08.31', 'X2023.09.30', 'X2023.10.31', 'X2023.11.30', 'X2023.12.31')], na.rm = TRUE)
data$avg_rent_2022 <- rowMeans(data[, c('X2022.01.31', 'X2022.02.28', 'X2022.03.31', 'X2022.04.30', 'X2022.05.31', 'X2022.06.30', 'X2022.07.31', 'X2022.08.31', 'X2022.09.30', 'X2022.10.31', 'X2022.11.30', 'X2022.12.31')], na.rm = TRUE)
data$avg_rent_2021 <- rowMeans(data[, c('X2021.01.31', 'X2021.02.28', 'X2021.03.31', 'X2021.04.30', 'X2021.05.31', 'X2021.06.30', 'X2021.07.31', 'X2021.08.31', 'X2021.09.30', 'X2021.10.31', 'X2021.11.30', 'X2021.12.31')], na.rm = TRUE)
data$avg_rent_2020 <- rowMeans(data[, c('X2020.01.31', 'X2020.02.29', 'X2020.03.31', 'X2020.04.30', 'X2020.05.31', 'X2020.06.30', 'X2020.07.31', 'X2020.08.31', 'X2020.09.30', 'X2020.10.31', 'X2020.11.30', 'X2020.12.31')], na.rm = TRUE)
data$avg_rent_2019 <- rowMeans(data[, c('X2019.01.31', 'X2019.02.28', 'X2019.03.31', 'X2019.04.30', 'X2019.05.31', 'X2019.06.30', 'X2019.07.31', 'X2019.08.31', 'X2019.09.30', 'X2019.10.31', 'X2019.11.30', 'X2019.12.31')], na.rm = TRUE)
data$avg_rent_2018 <- rowMeans(data[, c('X2018.01.31', 'X2018.02.28', 'X2018.03.31', 'X2018.04.30', 'X2018.05.31', 'X2018.06.30', 'X2018.07.31', 'X2018.08.31', 'X2018.09.30', 'X2018.10.31', 'X2018.11.30', 'X2018.12.31')], na.rm = TRUE)
data$avg_rent_2017 <- rowMeans(data[, c('X2017.01.31', 'X2017.02.28', 'X2017.03.31', 'X2017.04.30', 'X2017.05.31', 'X2017.06.30', 'X2017.07.31', 'X2017.08.31', 'X2017.09.30', 'X2017.10.31', 'X2017.11.30', 'X2017.12.31')], na.rm = TRUE)
data$avg_rent_2016 <- rowMeans(data[, c('X2016.01.31', 'X2016.02.29', 'X2016.03.31', 'X2016.04.30', 'X2016.05.31', 'X2016.06.30', 'X2016.07.31', 'X2016.08.31', 'X2016.09.30', 'X2016.10.31', 'X2016.11.30', 'X2016.12.31')], na.rm = TRUE)
data$avg_rent_2015 <- rowMeans(data[, c('X2015.01.31', 'X2015.02.28', 'X2015.03.31', 'X2015.04.30', 'X2015.05.31', 'X2015.06.30', 'X2015.07.31', 'X2015.08.31', 'X2015.09.30', 'X2015.10.31', 'X2015.11.30', 'X2015.12.31')], na.rm = TRUE)

Make a new data frame selecting only the columns we want for modeling

data_modeling <- data[, c('avg_rent_2024', 'avg_rent_2023', 'avg_rent_2022', 'avg_rent_2021', 'avg_rent_2020', 'avg_rent_2019', 'avg_rent_2018', 'avg_rent_2017', 'avg_rent_2016', 'avg_rent_2015', 'population', 'property_taxes.x_2022', 'property_taxes.x_2021', 'property_taxes.x_2020', 'property_taxes.x_2019', 'property_taxes.x_2018', 'property_taxes.x_2017', 'property_taxes.x_2016', 'property_taxes.x_2015', 'median_income_2022', 'median_income_2021', 'median_income_2020', 'median_income_2019', 'median_income_2018', 'median_income_2017', 'median_income_2016', 'median_income_2015', 'county_state', 'StateName')]

Save the factors as needed.

data_modeling$county_state <- as.factor(data_modeling$county_state)
data_modeling$StateName <- as.factor(data_modeling$StateName)

Now we model! (Start with a linear model with all factors included)

lm_model_full <- lm(avg_rent_2024 ~ . -(county_state), data = data_modeling)
summary(lm_model_full)
## 
## Call:
## lm(formula = avg_rent_2024 ~ . - (county_state), data = data_modeling)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -92.95 -14.73   0.33  14.57  98.84 
## 
## Coefficients:
##                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)           -2.80e+01   1.37e+01   -2.04  0.04221 *  
## avg_rent_2023          1.26e+00   3.53e-02   35.67  < 2e-16 ***
## avg_rent_2022         -2.90e-01   4.86e-02   -5.97  4.2e-09 ***
## avg_rent_2021         -4.11e-02   4.19e-02   -0.98  0.32669    
## avg_rent_2020          4.71e-01   7.67e-02    6.14  1.5e-09 ***
## avg_rent_2019         -2.28e-01   8.24e-02   -2.77  0.00579 ** 
## avg_rent_2018         -2.00e-01   9.95e-02   -2.01  0.04484 *  
## avg_rent_2017          2.71e-01   1.32e-01    2.06  0.04019 *  
## avg_rent_2016         -4.71e-01   1.22e-01   -3.87  0.00012 ***
## avg_rent_2015          2.75e-01   6.14e-02    4.48  9.2e-06 ***
## population            -3.46e-06   8.23e-07   -4.20  3.0e-05 ***
## property_taxes.x_2022  3.96e-02   2.34e-02    1.70  0.09024 .  
## property_taxes.x_2021  1.65e-02   4.82e-02    0.34  0.73283    
## property_taxes.x_2020 -9.40e-02   4.80e-02   -1.96  0.05061 .  
## property_taxes.x_2019 -9.09e-02   5.71e-02   -1.59  0.11167    
## property_taxes.x_2018  1.23e-01   7.44e-02    1.65  0.09897 .  
## property_taxes.x_2017  1.41e-02   8.88e-02    0.16  0.87407    
## property_taxes.x_2016 -1.71e-04   9.28e-02    0.00  0.99853    
## property_taxes.x_2015 -4.67e-03   4.99e-02   -0.09  0.92539    
## median_income_2022    -2.40e-03   1.63e-03   -1.47  0.14221    
## median_income_2021     3.57e-03   2.64e-03    1.35  0.17695    
## median_income_2020    -3.75e-05   2.29e-03   -0.02  0.98694    
## median_income_2019    -4.61e-03   2.38e-03   -1.94  0.05334 .  
## median_income_2018     1.05e-03   2.79e-03    0.38  0.70525    
## median_income_2017     2.20e-03   3.09e-03    0.71  0.47699    
## median_income_2016    -4.24e-03   3.24e-03   -1.31  0.19057    
## median_income_2015     4.83e-03   2.17e-03    2.23  0.02634 *  
## StateNameAR            1.57e+01   1.76e+01    0.89  0.37265    
## StateNameAZ           -1.43e+01   1.36e+01   -1.05  0.29223    
## StateNameCA            2.26e+01   1.35e+01    1.67  0.09454 .  
## StateNameCO            7.19e+00   1.33e+01    0.54  0.58813    
## StateNameDC            2.01e+01   3.07e+01    0.66  0.51205    
## StateNameDE            4.31e+01   2.23e+01    1.94  0.05322 .  
## StateNameFL           -3.38e+00   1.24e+01   -0.27  0.78519    
## StateNameGA            1.01e+00   1.24e+01    0.08  0.93460    
## StateNameHI            5.02e+01   1.75e+01    2.87  0.00428 ** 
## StateNameIA           -9.68e+00   2.28e+01   -0.42  0.67102    
## StateNameID            1.39e+01   1.82e+01    0.76  0.44651    
## StateNameIL            3.55e+01   1.62e+01    2.19  0.02890 *  
## StateNameIN            2.37e+01   1.44e+01    1.65  0.09926 .  
## StateNameKS            2.83e+01   1.61e+01    1.76  0.07926 .  
## StateNameKY            3.57e+01   2.22e+01    1.61  0.10881    
## StateNameLA            2.87e+00   1.81e+01    0.16  0.87395    
## StateNameMA            2.00e+01   1.68e+01    1.19  0.23325    
## StateNameMD            1.68e+00   1.51e+01    0.11  0.91118    
## StateNameME            1.40e+01   3.00e+01    0.46  0.64213    
## StateNameMI            2.71e+01   1.41e+01    1.92  0.05524 .  
## StateNameMN           -1.04e+01   1.72e+01   -0.60  0.54777    
## StateNameMO            2.32e+01   1.60e+01    1.45  0.14883    
## StateNameMT            9.11e+00   2.29e+01    0.40  0.69118    
## StateNameNC           -4.03e+00   1.27e+01   -0.32  0.75052    
## StateNameNE            6.77e+00   2.30e+01    0.29  0.76902    
## StateNameNH            3.95e-01   3.13e+01    0.01  0.98994    
## StateNameNJ            7.99e+00   2.27e+01    0.35  0.72529    
## StateNameNM            8.84e+00   2.27e+01    0.39  0.69713    
## StateNameNV            2.02e+01   1.76e+01    1.15  0.25171    
## StateNameNY           -6.73e+00   1.93e+01   -0.35  0.72779    
## StateNameOH            3.22e+01   1.49e+01    2.16  0.03132 *  
## StateNameOK            1.51e+01   1.45e+01    1.04  0.29935    
## StateNameOR            1.48e+01   1.64e+01    0.90  0.36627    
## StateNamePA            9.12e+00   1.56e+01    0.59  0.55849    
## StateNameRI            5.91e+01   2.43e+01    2.43  0.01551 *  
## StateNameSC            1.88e+01   1.35e+01    1.39  0.16371    
## StateNameTN           -2.58e+00   1.45e+01   -0.18  0.85869    
## StateNameTX           -1.76e+01   1.35e+01   -1.30  0.19430    
## StateNameUT            9.84e+00   1.95e+01    0.50  0.61401    
## StateNameVA            3.41e+01   1.39e+01    2.46  0.01418 *  
## StateNameWA            2.60e+01   1.44e+01    1.80  0.07199 .  
## StateNameWI            4.96e+00   2.41e+01    0.21  0.83670    
## StateNameWY            1.71e+01   2.24e+01    0.76  0.44633    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.8 on 586 degrees of freedom
##   (2571 observations deleted due to missingness)
## Multiple R-squared:  0.999,  Adjusted R-squared:  0.999 
## F-statistic: 9.34e+03 on 69 and 586 DF,  p-value: <2e-16

Now use backwards selection.

# Use backwards selection
bwd_model <- step(lm_model_full, direction = "backward", trace = 0)
anova(bwd_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 6.50e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 3.15e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 1.04e+02 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.86e+02 < 2e-16 ***
## avg_rent_2018           1 1.86e+03 1.86e+03 2.62e+00 0.10581    
## avg_rent_2017           1 9.02e+03 9.02e+03 1.27e+01 0.00040 ***
## avg_rent_2016           1 2.27e+02 2.27e+02 3.20e-01 0.57191    
## avg_rent_2015           1 4.51e+04 4.51e+04 6.34e+01 8.4e-15 ***
## population              1 2.23e+04 2.23e+04 3.14e+01 3.2e-08 ***
## property_taxes.x_2022   1 2.10e+01 2.10e+01 3.00e-02 0.86317    
## property_taxes.x_2020   1 1.07e+04 1.07e+04 1.50e+01 0.00012 ***
## property_taxes.x_2019   1 3.75e+04 3.75e+04 5.28e+01 1.1e-12 ***
## property_taxes.x_2018   1 2.81e+04 2.81e+04 3.96e+01 6.1e-10 ***
## median_income_2022      1 1.40e+03 1.40e+03 1.96e+00 0.16166    
## median_income_2021      1 8.47e+03 8.47e+03 1.19e+01 0.00060 ***
## median_income_2019      1 1.77e+03 1.77e+03 2.49e+00 0.11511    
## median_income_2015      1 1.46e+04 1.46e+04 2.05e+01 7.2e-06 ***
## StateName              43 1.24e+05 2.89e+03 4.07e+00 3.8e-15 ***
## Residuals             595 4.23e+05 7.11e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now try removing factors one by one starting with the largest p-value each time.

new_model <- update(bwd_model, .~. -property_taxes.x_2022)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 6.35e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 3.08e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 1.01e+02 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.82e+02 < 2e-16 ***
## avg_rent_2018           1 1.86e+03 1.86e+03 2.57e+00 0.10976    
## avg_rent_2017           1 9.02e+03 9.02e+03 1.24e+01 0.00046 ***
## avg_rent_2016           1 2.27e+02 2.27e+02 3.10e-01 0.57621    
## avg_rent_2015           1 4.51e+04 4.51e+04 6.20e+01 1.6e-14 ***
## population              1 2.23e+04 2.23e+04 3.07e+01 4.5e-08 ***
## property_taxes.x_2020   1 2.67e+02 2.67e+02 3.70e-01 0.54478    
## property_taxes.x_2019   1 4.43e+04 4.43e+04 6.10e+01 2.6e-14 ***
## property_taxes.x_2018   1 1.39e+04 1.39e+04 1.91e+01 1.4e-05 ***
## median_income_2022      1 2.20e+03 2.20e+03 3.02e+00 0.08257 .  
## median_income_2021      1 7.53e+03 7.53e+03 1.04e+01 0.00135 ** 
## median_income_2019      1 3.13e+03 3.13e+03 4.30e+00 0.03850 *  
## median_income_2015      1 1.05e+04 1.05e+04 1.45e+01 0.00015 ***
## StateName              43 1.35e+05 3.13e+03 4.31e+00 < 2e-16 ***
## Residuals             596 4.33e+05 7.27e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(bwd_model, .~. -avg_rent_2016)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 6.36e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 3.09e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 1.02e+02 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.82e+02 < 2e-16 ***
## avg_rent_2018           1 1.86e+03 1.86e+03 2.57e+00 0.10956    
## avg_rent_2017           1 9.02e+03 9.02e+03 1.24e+01 0.00046 ***
## avg_rent_2015           1 1.08e+04 1.08e+04 1.49e+01 0.00013 ***
## population              1 1.89e+04 1.89e+04 2.60e+01 4.6e-07 ***
## property_taxes.x_2022   1 5.00e+01 5.00e+01 7.00e-02 0.79288    
## property_taxes.x_2020   1 1.06e+04 1.06e+04 1.45e+01 0.00015 ***
## property_taxes.x_2019   1 3.86e+04 3.86e+04 5.32e+01 9.6e-13 ***
## property_taxes.x_2018   1 3.32e+04 3.32e+04 4.57e+01 3.3e-11 ***
## median_income_2022      1 3.50e+03 3.50e+03 4.82e+00 0.02857 *  
## median_income_2021      1 8.46e+03 8.46e+03 1.17e+01 0.00068 ***
## median_income_2019      1 1.83e+03 1.83e+03 2.53e+00 0.11256    
## median_income_2015      1 2.05e+04 2.05e+04 2.82e+01 1.6e-07 ***
## StateName              43 1.38e+05 3.22e+03 4.43e+00 < 2e-16 ***
## Residuals             596 4.33e+05 7.26e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -property_taxes.x_2022)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 6.20e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 3.01e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 9.90e+01 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.77e+02 < 2e-16 ***
## avg_rent_2018           1 1.86e+03 1.86e+03 2.51e+00 0.11398    
## avg_rent_2017           1 9.02e+03 9.02e+03 1.21e+01 0.00054 ***
## avg_rent_2015           1 1.08e+04 1.08e+04 1.45e+01 0.00015 ***
## population              1 1.89e+04 1.89e+04 2.54e+01 6.3e-07 ***
## property_taxes.x_2020   1 3.51e+02 3.51e+02 4.70e-01 0.49261    
## property_taxes.x_2019   1 4.49e+04 4.49e+04 6.04e+01 3.5e-14 ***
## property_taxes.x_2018   1 1.68e+04 1.68e+04 2.26e+01 2.6e-06 ***
## median_income_2022      1 4.90e+03 4.90e+03 6.59e+00 0.01052 *  
## median_income_2021      1 7.48e+03 7.48e+03 1.01e+01 0.00160 ** 
## median_income_2019      1 3.30e+03 3.30e+03 4.43e+00 0.03570 *  
## median_income_2015      1 1.56e+04 1.56e+04 2.09e+01 5.7e-06 ***
## StateName              43 1.50e+05 3.49e+03 4.69e+00 < 2e-16 ***
## Residuals             597 4.44e+05 7.44e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -property_taxes.x_2020)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 6.21e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 3.01e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 9.92e+01 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.78e+02 < 2e-16 ***
## avg_rent_2018           1 1.86e+03 1.86e+03 2.51e+00 0.11374    
## avg_rent_2017           1 9.02e+03 9.02e+03 1.21e+01 0.00053 ***
## avg_rent_2015           1 1.08e+04 1.08e+04 1.45e+01 0.00015 ***
## population              1 1.89e+04 1.89e+04 2.54e+01 6.2e-07 ***
## property_taxes.x_2019   1 9.26e+02 9.26e+02 1.25e+00 0.26471    
## property_taxes.x_2018   1 5.97e+04 5.97e+04 8.04e+01 < 2e-16 ***
## median_income_2022      1 4.75e+03 4.75e+03 6.40e+00 0.01169 *  
## median_income_2021      1 8.12e+03 8.12e+03 1.09e+01 0.00100 ** 
## median_income_2019      1 2.97e+03 2.97e+03 3.99e+00 0.04613 *  
## median_income_2015      1 1.61e+04 1.61e+04 2.17e+01 4.0e-06 ***
## StateName              43 1.51e+05 3.51e+03 4.72e+00 < 2e-16 ***
## Residuals             598 4.44e+05 7.43e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -property_taxes.x_2019)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 6.06e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 2.94e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 9.68e+01 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.73e+02 < 2e-16 ***
## avg_rent_2018           1 1.86e+03 1.86e+03 2.45e+00 0.11815    
## avg_rent_2017           1 9.02e+03 9.02e+03 1.18e+01 0.00062 ***
## avg_rent_2015           1 1.08e+04 1.08e+04 1.42e+01 0.00018 ***
## population              1 1.89e+04 1.89e+04 2.48e+01 8.4e-07 ***
## property_taxes.x_2018   1 1.96e+03 1.96e+03 2.58e+00 0.10882    
## median_income_2022      1 2.12e+02 2.12e+02 2.80e-01 0.59782    
## median_income_2021      1 8.36e+03 8.36e+03 1.10e+01 0.00098 ***
## median_income_2019      1 2.32e+03 2.32e+03 3.04e+00 0.08152 .  
## median_income_2015      1 3.48e+04 3.48e+04 4.57e+01 3.3e-11 ***
## StateName              43 1.84e+05 4.28e+03 5.62e+00 < 2e-16 ***
## Residuals             599 4.56e+05 7.61e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -median_income_2022)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 6.05e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 2.94e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 9.66e+01 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.73e+02 < 2e-16 ***
## avg_rent_2018           1 1.86e+03 1.86e+03 2.44e+00 0.11850    
## avg_rent_2017           1 9.02e+03 9.02e+03 1.18e+01 0.00063 ***
## avg_rent_2015           1 1.08e+04 1.08e+04 1.42e+01 0.00019 ***
## population              1 1.89e+04 1.89e+04 2.48e+01 8.6e-07 ***
## property_taxes.x_2018   1 1.96e+03 1.96e+03 2.57e+00 0.10916    
## median_income_2021      1 4.07e+02 4.07e+02 5.30e-01 0.46564    
## median_income_2019      1 2.00e+00 2.00e+00 0.00e+00 0.96186    
## median_income_2015      1 3.93e+04 3.93e+04 5.15e+01 2.1e-12 ***
## StateName              43 1.88e+05 4.38e+03 5.74e+00 < 2e-16 ***
## Residuals             600 4.58e+05 7.63e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -median_income_2019)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 5.96e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 2.89e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 9.52e+01 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.71e+02 < 2e-16 ***
## avg_rent_2018           1 1.86e+03 1.86e+03 2.41e+00 0.12129    
## avg_rent_2017           1 9.02e+03 9.02e+03 1.16e+01 0.00069 ***
## avg_rent_2015           1 1.08e+04 1.08e+04 1.39e+01 0.00021 ***
## population              1 1.89e+04 1.89e+04 2.44e+01 1.0e-06 ***
## property_taxes.x_2018   1 1.96e+03 1.96e+03 2.54e+00 0.11184    
## median_income_2021      1 4.07e+02 4.07e+02 5.20e-01 0.46900    
## median_income_2015      1 1.70e+04 1.70e+04 2.19e+01 3.5e-06 ***
## StateName              43 2.03e+05 4.72e+03 6.09e+00 < 2e-16 ***
## Residuals             601 4.65e+05 7.74e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -median_income_2021)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 5.82e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 2.82e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 9.28e+01 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.66e+02 < 2e-16 ***
## avg_rent_2018           1 1.86e+03 1.86e+03 2.35e+00 0.12589    
## avg_rent_2017           1 9.02e+03 9.02e+03 1.14e+01 0.00080 ***
## avg_rent_2015           1 1.08e+04 1.08e+04 1.36e+01 0.00025 ***
## population              1 1.89e+04 1.89e+04 2.38e+01 1.4e-06 ***
## property_taxes.x_2018   1 1.96e+03 1.96e+03 2.47e+00 0.11627    
## median_income_2015      1 2.72e+03 2.72e+03 3.42e+00 0.06472 .  
## StateName              43 2.05e+05 4.77e+03 6.01e+00 < 2e-16 ***
## Residuals             602 4.78e+05 7.94e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -avg_rent_2018)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 5.79e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 2.81e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 9.25e+01 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.66e+02 < 2e-16 ***
## avg_rent_2017           1 1.88e+02 1.88e+02 2.40e-01   0.628    
## avg_rent_2015           1 1.75e+04 1.75e+04 2.19e+01 3.5e-06 ***
## population              1 2.07e+04 2.07e+04 2.60e+01 4.6e-07 ***
## property_taxes.x_2018   1 2.01e+03 2.01e+03 2.53e+00   0.112    
## median_income_2015      1 2.54e+03 2.54e+03 3.19e+00   0.075 .  
## StateName              43 2.05e+05 4.77e+03 5.98e+00 < 2e-16 ***
## Residuals             603 4.80e+05 7.97e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -avg_rent_2017)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                        Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023           1 4.62e+08 4.62e+08 5.67e+05 < 2e-16 ***
## avg_rent_2022           1 2.24e+05 2.24e+05 2.75e+02 < 2e-16 ***
## avg_rent_2020           1 7.37e+04 7.37e+04 9.05e+01 < 2e-16 ***
## avg_rent_2019           1 1.32e+05 1.32e+05 1.62e+02 < 2e-16 ***
## avg_rent_2015           1 6.64e+03 6.64e+03 8.16e+00  0.0044 ** 
## population              1 2.44e+04 2.44e+04 2.99e+01 6.5e-08 ***
## property_taxes.x_2018   1 8.70e+02 8.70e+02 1.07e+00  0.3015    
## median_income_2015      1 2.70e+03 2.70e+03 3.31e+00  0.0693 .  
## StateName              43 2.02e+05 4.70e+03 5.78e+00 < 2e-16 ***
## Residuals             604 4.92e+05 8.14e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
new_model <- update(new_model, .~. -property_taxes.x_2018)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                     Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023        1 4.62e+08 4.62e+08 5.67e+05 < 2e-16 ***
## avg_rent_2022        1 2.24e+05 2.24e+05 2.75e+02 < 2e-16 ***
## avg_rent_2020        1 7.37e+04 7.37e+04 9.05e+01 < 2e-16 ***
## avg_rent_2019        1 1.32e+05 1.32e+05 1.62e+02 < 2e-16 ***
## avg_rent_2015        1 6.64e+03 6.64e+03 8.15e+00  0.0044 ** 
## population           1 2.44e+04 2.44e+04 2.99e+01 6.6e-08 ***
## median_income_2015   1 3.56e+03 3.56e+03 4.37e+00  0.0369 *  
## StateName           43 2.01e+05 4.67e+03 5.73e+00 < 2e-16 ***
## Residuals          605 4.93e+05 8.15e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Here, we decide to take median_income_2015 out becaue even though it has significance at the 0.05 level it does not make logical sense for this to contribute meaningfully when median incomes from other more recent years did not.

new_model <- update(new_model, .~. -median_income_2015)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_rent_2023   1 4.62e+08 4.62e+08 5.66e+05 < 2e-16 ***
## avg_rent_2022   1 2.24e+05 2.24e+05 2.75e+02 < 2e-16 ***
## avg_rent_2020   1 7.37e+04 7.37e+04 9.04e+01 < 2e-16 ***
## avg_rent_2019   1 1.32e+05 1.32e+05 1.62e+02 < 2e-16 ***
## avg_rent_2015   1 6.64e+03 6.64e+03 8.15e+00  0.0045 ** 
## population      1 2.44e+04 2.44e+04 2.99e+01 6.7e-08 ***
## StateName      43 2.03e+05 4.73e+03 5.80e+00 < 2e-16 ***
## Residuals     606 4.94e+05 8.15e+02                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Here, again even though avg_rent_2015 is significant it does not make the most sense given more recent years were taken out so we will remove it.

new_model <- update(new_model, .~. -avg_rent_2015)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                Df   Sum Sq  Mean Sq  F value Pr(>F)    
## avg_rent_2023   1 3.78e+09 3.78e+09 1.18e+06 <2e-16 ***
## avg_rent_2022   1 1.65e+06 1.65e+06 5.17e+02 <2e-16 ***
## avg_rent_2020   1 7.90e+05 7.90e+05 2.47e+02 <2e-16 ***
## avg_rent_2019   1 1.27e+06 1.27e+06 3.97e+02 <2e-16 ***
## population      1 2.66e+05 2.66e+05 8.33e+01 <2e-16 ***
## StateName      48 1.64e+06 3.42e+04 1.07e+01 <2e-16 ***
## Residuals     982 3.14e+06 3.20e+03                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Here all factors are significant at the 0.05 level and make sense to be contributors. The fact that the avg_rent_2021 was taken out of the model could make sense due to COVID having an impact on what renters were leasing at the following year once rent freezes were no longer in effect in the wake of the immediate COVID crisis.

final_US_model <- new_model
anova(final_US_model)
## Analysis of Variance Table
## 
## Response: avg_rent_2024
##                Df   Sum Sq  Mean Sq  F value Pr(>F)    
## avg_rent_2023   1 3.78e+09 3.78e+09 1.18e+06 <2e-16 ***
## avg_rent_2022   1 1.65e+06 1.65e+06 5.17e+02 <2e-16 ***
## avg_rent_2020   1 7.90e+05 7.90e+05 2.47e+02 <2e-16 ***
## avg_rent_2019   1 1.27e+06 1.27e+06 3.97e+02 <2e-16 ***
## population      1 2.66e+05 2.66e+05 8.33e+01 <2e-16 ***
## StateName      48 1.64e+06 3.42e+04 1.07e+01 <2e-16 ***
## Residuals     982 3.14e+06 3.20e+03                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Intepretation:

Modeling the county-aggregated data

# First, drop NA's
county_aggregated_df_clean <- drop_na(county_aggregated_df)

fit_rent_only <- lm(avg_2024_rent ~ avg_2023_rent + avg_2022_rent + avg_2021_rent + avg_2020_rent + avg_2019_rent + avg_2018_rent + avg_2017_rent + avg_2016_rent + avg_2015_rent, data = county_aggregated_df_clean) 
summary(fit_rent_only)
## 
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent + 
##     avg_2021_rent + avg_2020_rent + avg_2019_rent + avg_2018_rent + 
##     avg_2017_rent + avg_2016_rent + avg_2015_rent, data = county_aggregated_df_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -214.6  -27.9   -8.7   15.0  383.1 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   15.56680   15.45348    1.01     0.31    
## avg_2023_rent  1.35742    0.07030   19.31   <2e-16 ***
## avg_2022_rent -0.48856    0.09720   -5.03    9e-07 ***
## avg_2021_rent  0.11540    0.09352    1.23     0.22    
## avg_2020_rent  0.18077    0.20097    0.90     0.37    
## avg_2019_rent -0.17004    0.19494   -0.87     0.38    
## avg_2018_rent  0.04028    0.19740    0.20     0.84    
## avg_2017_rent -0.01551    0.17884   -0.09     0.93    
## avg_2016_rent -0.00901    0.12139   -0.07     0.94    
## avg_2015_rent  0.03256    0.07022    0.46     0.64    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 61.4 on 276 degrees of freedom
## Multiple R-squared:  0.986,  Adjusted R-squared:  0.986 
## F-statistic: 2.22e+03 on 9 and 276 DF,  p-value: <2e-16

Now use backwards selection to choose variables

# Use backwards selection
bwd_rent_only_model <- step(fit_rent_only, direction = "backward", trace = 0)
summary(bwd_rent_only_model)
## 
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent + 
##     avg_2021_rent, data = county_aggregated_df_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -204.9  -28.8   -9.7   16.4  376.3 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    19.5348    13.4647    1.45     0.15    
## avg_2023_rent   1.3743     0.0677   20.29  < 2e-16 ***
## avg_2022_rent  -0.5265     0.0892   -5.90  1.0e-08 ***
## avg_2021_rent   0.1894     0.0444    4.27  2.7e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 60.9 on 282 degrees of freedom
## Multiple R-squared:  0.986,  Adjusted R-squared:  0.986 
## F-statistic: 6.76e+03 on 3 and 282 DF,  p-value: <2e-16

Let’s do the same thing using Lasso.

library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-8
library(coefplot)

## get X matrix and y
X <- as.matrix(county_aggregated_df_clean[, c('avg_2023_rent', 'avg_2022_rent', 'avg_2021_rent', 'avg_2020_rent', 'avg_2019_rent', 'avg_2018_rent', 'avg_2017_rent', 'avg_2016_rent', 'avg_2015_rent')]) # Exclude target, FIPS, and County columns
y <- county_aggregated_df_clean$avg_2024_rent  # Assuming target is stored in `target_variable`

cv_fit <- cv.glmnet(X, y, alpha = 1)

plot(cv_fit)

Now we will add other variables to the mix.

fit_all_info <- lm(avg_2024_rent ~ avg_2023_rent + avg_2022_rent + avg_2021_rent + avg_2020_rent + avg_2019_rent + avg_2018_rent + avg_2017_rent + avg_2016_rent + avg_2015_rent + avg_2015_median_income + avg_2016_median_income + avg_2016_median_income + avg_2017_median_income + avg_2018_median_income + avg_2019_median_income + avg_2020_median_income + avg_2021_median_income + avg_2022_median_income + avg_2015_property_taxes + avg_2016_property_taxes + avg_2017_property_taxes + avg_2018_property_taxes + avg_2019_property_taxes + avg_2020_property_taxes + avg_2021_property_taxes + avg_2022_property_taxes, data = county_aggregated_df_clean) 
summary(fit_all_info)
## 
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent + 
##     avg_2021_rent + avg_2020_rent + avg_2019_rent + avg_2018_rent + 
##     avg_2017_rent + avg_2016_rent + avg_2015_rent + avg_2015_median_income + 
##     avg_2016_median_income + avg_2016_median_income + avg_2017_median_income + 
##     avg_2018_median_income + avg_2019_median_income + avg_2020_median_income + 
##     avg_2021_median_income + avg_2022_median_income + avg_2015_property_taxes + 
##     avg_2016_property_taxes + avg_2017_property_taxes + avg_2018_property_taxes + 
##     avg_2019_property_taxes + avg_2020_property_taxes + avg_2021_property_taxes + 
##     avg_2022_property_taxes, data = county_aggregated_df_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -204.0  -26.1   -6.9   15.6  389.3 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              6.31678   18.49975    0.34  0.73304    
## avg_2023_rent            1.23267    0.07355   16.76  < 2e-16 ***
## avg_2022_rent           -0.34017    0.10201   -3.33  0.00098 ***
## avg_2021_rent            0.15020    0.09664    1.55  0.12134    
## avg_2020_rent           -0.10523    0.20607   -0.51  0.61003    
## avg_2019_rent            0.10374    0.19773    0.52  0.60029    
## avg_2018_rent           -0.08393    0.19826   -0.42  0.67241    
## avg_2017_rent            0.16437    0.18768    0.88  0.38196    
## avg_2016_rent           -0.03120    0.12945   -0.24  0.80971    
## avg_2015_rent           -0.04574    0.07630   -0.60  0.54939    
## avg_2015_median_income   0.00387    0.00513    0.75  0.45121    
## avg_2016_median_income   0.00177    0.00809    0.22  0.82693    
## avg_2017_median_income  -0.00744    0.00829   -0.90  0.37048    
## avg_2018_median_income   0.00507    0.00709    0.71  0.47538    
## avg_2019_median_income  -0.00356    0.00580   -0.61  0.53956    
## avg_2020_median_income  -0.00762    0.00572   -1.33  0.18427    
## avg_2021_median_income   0.01485    0.00628    2.37  0.01876 *  
## avg_2022_median_income  -0.00637    0.00410   -1.55  0.12186    
## avg_2015_property_taxes -0.04285    0.13854   -0.31  0.75733    
## avg_2016_property_taxes  0.19511    0.25707    0.76  0.44856    
## avg_2017_property_taxes -0.14826    0.23841   -0.62  0.53456    
## avg_2018_property_taxes  0.07925    0.20797    0.38  0.70348    
## avg_2019_property_taxes -0.04094    0.15407   -0.27  0.79065    
## avg_2020_property_taxes  0.04837    0.13243    0.37  0.71520    
## avg_2021_property_taxes -0.09172    0.13513   -0.68  0.49791    
## avg_2022_property_taxes  0.01286    0.06411    0.20  0.84116    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 59.1 on 260 degrees of freedom
## Multiple R-squared:  0.988,  Adjusted R-squared:  0.987 
## F-statistic:  862 on 25 and 260 DF,  p-value: <2e-16

Now run backwards selection again

# Use backwards selection
bwd_all_info_model <- step(fit_all_info, direction = "backward", trace = 0)
summary(bwd_all_info_model)
## 
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent + 
##     avg_2021_rent + avg_2020_median_income + avg_2021_median_income + 
##     avg_2022_median_income + avg_2016_property_taxes + avg_2021_property_taxes, 
##     data = county_aggregated_df_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -204.7  -27.0   -5.2   13.9  397.6 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              9.03080   15.82493    0.57  0.56869    
## avg_2023_rent            1.22629    0.07024   17.46  < 2e-16 ***
## avg_2022_rent           -0.31949    0.09297   -3.44  0.00068 ***
## avg_2021_rent            0.12840    0.04641    2.77  0.00605 ** 
## avg_2020_median_income  -0.00710    0.00364   -1.95  0.05242 .  
## avg_2021_median_income   0.01410    0.00575    2.45  0.01487 *  
## avg_2022_median_income  -0.00664    0.00354   -1.88  0.06138 .  
## avg_2016_property_taxes  0.06211    0.01263    4.92  1.5e-06 ***
## avg_2021_property_taxes -0.04991    0.01155   -4.32  2.2e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 57.8 on 277 degrees of freedom
## Multiple R-squared:  0.988,  Adjusted R-squared:  0.988 
## F-statistic: 2.82e+03 on 8 and 277 DF,  p-value: <2e-16

Now check this output using Anova

Take out the avg_2020_median_income

new_model <- update(bwd_all_info_model, .~. -avg_2020_median_income)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_2024_rent
##                          Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_2023_rent             1 75032525 75032525 22245.65 < 2e-16 ***
## avg_2022_rent             1    62408    62408    18.50 2.4e-05 ***
## avg_2021_rent             1    67512    67512    20.02 1.1e-05 ***
## avg_2021_median_income    1     3739     3739     1.11  0.2933    
## avg_2022_median_income    1    15427    15427     4.57  0.0333 *  
## avg_2016_property_taxes   1    27709    27709     8.22  0.0045 ** 
## avg_2021_property_taxes   1    60770    60770    18.02 3.0e-05 ***
## Residuals               278   937668     3373                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now take out avg_2021_median_income

new_model <- update(new_model, .~. -avg_2021_median_income)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_2024_rent
##                          Df   Sum Sq  Mean Sq  F value  Pr(>F)    
## avg_2023_rent             1 75032525 75032525 22150.36 < 2e-16 ***
## avg_2022_rent             1    62408    62408    18.42 2.4e-05 ***
## avg_2021_rent             1    67512    67512    19.93 1.2e-05 ***
## avg_2022_median_income    1     2757     2757     0.81   0.368    
## avg_2016_property_taxes   1    37264    37264    11.00   0.001 ** 
## avg_2021_property_taxes   1    60204    60204    17.77 3.4e-05 ***
## Residuals               279   945090     3387                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Now take out avg_2022_median_income

new_model <- update(new_model, .~. -avg_2022_median_income)
anova(new_model)
## Analysis of Variance Table
## 
## Response: avg_2024_rent
##                          Df   Sum Sq  Mean Sq F value  Pr(>F)    
## avg_2023_rent             1 75032525 75032525 22199.2 < 2e-16 ***
## avg_2022_rent             1    62408    62408    18.5 2.4e-05 ***
## avg_2021_rent             1    67512    67512    20.0 1.1e-05 ***
## avg_2016_property_taxes   1    39351    39351    11.6 0.00074 ***
## avg_2021_property_taxes   1    59574    59574    17.6 3.6e-05 ***
## Residuals               280   946390     3380                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Let’s call and save the final model we have come to.

final_US_fuller_model <- new_model
summary(final_US_fuller_model)
## 
## Call:
## lm(formula = avg_2024_rent ~ avg_2023_rent + avg_2022_rent + 
##     avg_2021_rent + avg_2016_property_taxes + avg_2021_property_taxes, 
##     data = county_aggregated_df_clean)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -211.6  -25.2   -4.8   15.6  393.9 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)              15.8349    13.1349    1.21  0.22901    
## avg_2023_rent             1.2222     0.0705   17.33  < 2e-16 ***
## avg_2022_rent            -0.3261     0.0931   -3.50  0.00053 ***
## avg_2021_rent             0.1415     0.0453    3.12  0.00197 ** 
## avg_2016_property_taxes   0.0606     0.0125    4.84  2.1e-06 ***
## avg_2021_property_taxes  -0.0472     0.0112   -4.20  3.6e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 58.1 on 280 degrees of freedom
## Multiple R-squared:  0.988,  Adjusted R-squared:  0.987 
## F-statistic: 4.45e+03 on 5 and 280 DF,  p-value: <2e-16

Interpretation:

Let’s Look on a State-by-State Basis Again

California!

APPENDIX

Get data from the Census buearu for each county from 2015-2022

library(tidycensus)
library(tidyverse)


#Requesting API key from the Census
census_api_key("8742799b6616820ce0d0a8dd45acd13c5b47fcc7", install = TRUE)


# Set Census API Key
readRenviron("~/.Renviron")  # Ensure the API key is loaded


# Define years from 2015 to the most recent available ACS data
years <- 2015:2022


# Retrieve Median Household Income
income_data <- purrr::map_dfr(years, function(y) {
 get_acs(
   geography = "county",
   variables = c(median_income = "B19013_001"),
   year = y,
   survey = "acs5"
 ) %>%
 mutate(year = y)
})


# Retrieve Median Age Data
age_data <- purrr::map_dfr(years, function(y) {
 get_acs(
   geography = "county",
   variables = c(median_age = "B01002_001"), 
   year = y,
   survey = "acs5"
 ) %>%
 mutate(year = y)
})


#Inner Joining the Two Sets
state_summary <- income_data %>%
 left_join(age_data, by = c("GEOID", "NAME", "year")) %>%
 group_by(GEOID, NAME, year) %>%
 summarise(
   median_income = mean(estimate.x, na.rm = TRUE), 
   median_age = mean(estimate.y, na.rm = TRUE),    
   .groups = "drop"
   )


# Retrieve Property Tax Data
property_tax_data <- purrr::map_dfr(years, function(y) {
  get_acs(
    geography = "county",
    variables = c(
      property_taxes = "B25103_001"  # Selected Monthly Owner Costs for Housing Units (includes property taxes)
    ),
    year = y,
    survey = "acs5"
  ) %>%
  mutate(year = y)
})


state_taxes_summary <- property_tax_data %>%
  group_by(GEOID, NAME, year) %>%
  summarise(property_taxes = mean(estimate, na.rm = TRUE), .groups = "drop")






state_summary <- state_summary %>% 
  left_join(state_taxes_summary, by = c("GEOID", "NAME", "year"))

# Reshape data to have one row per county, separate columns for each year's median income & age
state_summary_wide <- state_summary %>%
  pivot_wider(names_from = year, 
              values_from = c(median_income, median_age, property_taxes.x), 
              names_glue = "{.value}_{year}")

Fix the formatting of the income data to separate the state and county as well as add the stae abbreviation code.

county_data <- read.csv("data/state_summary_wide.csv")

library(tidyverse)

county_data <- county_data %>%
  separate(NAME, into = c("county", "state"), sep = ", ")


state_abbreviations <- data.frame(
  state = c("Alabama", "Alaska", "Arizona", "Arkansas", "California", "Colorado", "Connecticut", 
            "Delaware","District of Columbia", "Florida", "Georgia", "Hawaii", "Idaho", "Illinois", "Indiana", "Iowa", 
            "Kansas", "Kentucky", "Louisiana", "Maine", "Maryland", "Massachusetts", "Michigan", 
            "Minnesota", "Mississippi", "Missouri", "Montana", "Nebraska", "Nevada", "New Hampshire", 
            "New Jersey", "New Mexico", "New York", "North Carolina", "North Dakota", "Ohio", "Oklahoma", 
            "Oregon", "Pennsylvania", "Rhode Island", "South Carolina", "South Dakota", "Tennessee", 
            "Texas", "Utah", "Vermont", "Virginia", "Washington", "West Virginia", "Wisconsin", "Wyoming"),
  abbreviation = c("AL", "AK", "AZ", "AR", "CA", "CO", "CT", "DE","DC", "FL", "GA", "HI", "ID", "IL", "IN", "IA", 
                   "KS", "KY", "LA", "ME", "MD", "MA", "MI", "MN", "MS", "MO", "MT", "NE", "NV", "NH", "NJ", 
                   "NM", "NY", "NC", "ND", "OH", "OK", "OR", "PA", "RI", "SC", "SD", "TN", "TX", "UT", "VT", 
                   "VA", "WA", "WV", "WI", "WY")
)

county_data <- county_data %>%
  left_join(state_abbreviations, by = "state")

write.csv(county_data, "state_income_summary_updated.csv", row.names = FALSE)

Load in the data

zillow <- read.csv("data/City_zori_uc_sfrcondomfr_sm_month.csv")

county_geo <- read.csv("data/uscounties.csv")

county_data <- read.csv("data/state_income_summary_updated.csv")

Inner join the data frames

df <- inner_join(county_data, county_geo, by = c("GEOID" = "county_fips"))
colnames(df)[colnames(df) == "county.x"] <- "county_fullname"

df <- df %>%
  mutate(county_state = paste(county_fullname, state_id))
zillow <- zillow %>%
  mutate(county_state = paste(CountyName, StateName))

data <- inner_join(zillow, df, by = "county_state")
data_cleaned <- data %>% distinct(RegionID, RegionName, X2015.01.31, .keep_all = TRUE)

data_cleaned <- data_cleaned %>%
  select(-c(RegionType, state_name, abbreviation, county.y, county_full, county_fullname, state_id, State))

colnames(data_cleaned)[colnames(data_cleaned) == "county_ascii"] <- "county"

write.csv(data_cleaned, "combined_zillow_project_data.csv", row.names = FALSE)